
from requests_html import HTMLSession
import pymysql
from urllib.request import urlretrieve
import base64
import random
import time
import datetime
# Connection settings for the MySQL server. The Mysql class below copies each
# of these values onto its instances and passes them to pymysql.connect().
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or an untracked config file.
MYSQL_CONFIG = dict(
    HOST='127.0.0.1',
    PORT=3306,
    USER='root',
    PASSWORD='yangtaoo',
    DB='news',
    CHARSET='utf8',
)

class Mysql:
    """Small pymysql helper for inserting crawled rows.

    Connection settings are copied from the module-level ``MYSQL_CONFIG`` at
    construction time. Every insert opens its own connection, commits, and
    closes the connection again, so no connection is left open between calls.
    """

    def __init__(self):
        self.host = MYSQL_CONFIG['HOST']
        self.port = MYSQL_CONFIG['PORT']
        self.user = MYSQL_CONFIG['USER']
        self.password = MYSQL_CONFIG['PASSWORD']
        self.db = MYSQL_CONFIG['DB']
        self.charset = MYSQL_CONFIG['CHARSET']

    def get_mysql_con(self):
        """Open and return a new pymysql connection built from the stored settings.

        The caller owns the returned connection and is responsible for
        closing it.
        """
        return pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.db,
            charset=self.charset
        )

    def _execute(self, sql, params):
        """Run one parameterized statement on a fresh connection and commit it.

        The cursor and the connection are always closed, even when the
        statement fails; in that case nothing is committed and the original
        exception propagates to the caller.
        """
        con = self.get_mysql_con()
        try:
            cur = con.cursor()
            try:
                cur.execute(sql, params)
            finally:
                cur.close()
            con.commit()
        finally:
            con.close()

    def insert_junshinews(self, DocID, Title, NodeId, PubTime, LinkUrl, Abstract, Editor, SourceName):
        """Insert one news row into the ``junshi`` table.

        The eight arguments map one-to-one onto the table columns of the same
        name and are passed as bound parameters.
        """
        sql = "insert into junshi (DocID,Title,NodeId,PubTime,LinkUrl,Abstract,Editor,SourceName) values(%s,%s,%s,%s,%s,%s,%s,%s);"
        self._execute(sql, (DocID, Title, NodeId, PubTime, LinkUrl, Abstract, Editor, SourceName))

    def insert_manhua(self, manhua_title, chapt_title, img):
        """Insert one comic page image into the ``manhua`` table.

        Args:
            manhua_title: Title of the comic.
            chapt_title: Title of the chapter the image belongs to.
            img: Raw image bytes; wrapped with ``pymysql.Binary`` for storage.

        The ``rectime`` column is filled with the current local time
        formatted as ``"%Y-%m-%d, %H:%M:%S"``.
        """
        rectime = datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
        sql = "insert into manhua (manhua_title,chapt_title,rectime,img) values(%s,%s,%s,%s);"
        self._execute(sql, (manhua_title, chapt_title, rectime, pymysql.Binary(img)))

# Site root; the hrefs scraped from the listing and comic pages are appended
# to this to build absolute URLs.
start_url = "http://www.imanhuaw.com"

# A chapter page embeds its image list in a script line containing this
# marker, as a double-quoted base64 string.
IMAGE_LIST_MARKER = "qTcms_S_m_murl_e="
# Separator between image URLs inside the decoded image list.
IMAGE_URL_SEPARATOR = "$qingtiandy$"


def parse_total_pages(last_page_href):
    """Return the page count encoded in the last pagination link.

    Only the final path segment is inspected, so both ``"12.html"`` and
    ``"/imanhua/all/12.html"`` yield ``12``.

    Raises:
        ValueError: if the file name does not start with an integer.
    """
    file_name = last_page_href.rsplit('/', 1)[-1]
    return int(file_name.split('.')[0])


def decode_image_urls(script_text):
    """Extract the image URLs embedded in a chapter page's script text.

    Every line containing ``IMAGE_LIST_MARKER`` is expected to carry a
    double-quoted base64 payload; the payload decodes to image URLs joined
    by ``IMAGE_URL_SEPARATOR``. Empty entries are dropped so callers never
    request an empty URL.

    Returns:
        A list of URL strings in page order; empty when no line matches.
    """
    urls = []
    for line in script_text.splitlines():
        if IMAGE_LIST_MARKER not in line:
            continue
        encoded = line.split('"')[1].encode()
        # base64.decodestring was removed in Python 3.9; b64decode performs
        # the same lenient decoding.
        decoded = base64.b64decode(encoded).decode()
        urls.extend(url for url in decoded.split(IMAGE_URL_SEPARATOR) if url)
    return urls


def main():
    """Crawl every listing page, comic and chapter, storing each image in MySQL."""
    print('start...')
    session = HTMLSession()
    mysql = Mysql()
    all_url = "https://www.imanhuaw.com/imanhua/all/"
    # NOTE(review): certificate verification is disabled only for this first
    # request; the later requests use the session default — confirm intent.
    res = session.get(url=all_url, verify=False)
    print(res.encoding)
    last_page_href = res.html.xpath("//div[@class='NewPages']/ul/li[last()]/a/@href")[0]
    total = parse_total_pages(last_page_href)
    print('总页数：'+str(total))
    for page in range(1, total + 1):
        page_url = all_url + str(page) + ".html"
        print("第"+str(page)+"页")
        page_res = session.get(page_url)
        manhua_urls = page_res.html.xpath("//div[@class='mh-works-info']/a/@href")
        # Visit every comic linked from this listing page.
        for url in manhua_urls:
            man_url = start_url + url
            man_res = session.get(man_url)
            manhua_title = man_res.html.xpath("//div[@class='mh-date-info-name']/h4/a/text()")[0]
            print(manhua_title)
            chapts = man_res.html.xpath("//ul[@id='mh-chapter-list-ol-0']/li")
            # Visit every chapter of the comic.
            for chapt in chapts:
                chapt_title = chapt.xpath("//a/p/text()")[0]
                chapt_url = start_url + chapt.xpath("//a/@href")[0]
                print(chapt_title, chapt_url)
                chapt_res = session.get(chapt_url)
                # The second script text node is the one scanned for the
                # image list (presumably where the site embeds it).
                script_text = chapt_res.html.xpath("//script/text()")[1]
                # Download and store every image of the chapter.
                for img_url in decode_image_urls(script_text):
                    # Random pause between downloads: crawling too fast
                    # apparently got the crawler blocked.
                    time.sleep(random.randint(1, 2))
                    img = session.get(img_url).content
                    mysql.insert_manhua(manhua_title=manhua_title, chapt_title=chapt_title, img=img)
                    print(chapt_title, img_url)
        print("第"+str(page)+"完")
    print("all done...")


if __name__ == "__main__":
    main()
